library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.5
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(pwr)
library(knitr)
library(kableExtra)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(extrafont)
## Registering fonts with R
library(ggrepel)
First we’ll do an F-test for equal variances to see if we can overfide the default Welch approximation.
# F-test for equal variances
#
chickwts_f <- chickwts %>%
filter(feed == "horsebean" | feed == "linseed") %>%
var.test(weight ~ feed, data = .)
chickwts_f
##
## F test to compare two variances
##
## data: weight by feed
## F = 0.54679, num df = 9, denom df = 11, p-value = 0.3739
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.1523986 2.1390857
## sample estimates:
## ratio of variances
## 0.5467906
# Retain the null hypothesis variances are equal
# override the default in t.test (var.equal = FALSE)
# t.test()
chickwts_t <- chickwts %>%
filter(feed == "horsebean" | feed == "linseed") %>%
t.test(weight ~ feed, data = ., var.equal = TRUE)
chickwts_t
##
## Two Sample t-test
##
## data: weight by feed
## t = -2.934, df = 20, p-value = 0.008205
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -100.17618 -16.92382
## sample estimates:
## mean in group horsebean mean in group linseed
## 160.20 218.75
## OR:
#chick_simple <- chickwts %>%
# var.test(weight ~ feed, data = .) # something wrong with the lines
#t_test <- chick_simple %>%
# t.test(weight ~ feed, data = ., var.equal = TRUE)
#t_test
chick_simple <- chickwts %>%
filter(feed == "horsebean" | feed == "linseed")
f_test <- chick_simple %>%
var.test(weight ~ feed, data = .)
t_test <- chick_simple %>%
t.test(weight ~ feed, data = .)
t_test
##
## Welch Two Sample t-test
##
## data: weight by feed
## t = -3.0172, df = 19.769, p-value = 0.006869
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -99.0597 -18.0403
## sample estimates:
## mean in group horsebean mean in group linseed
## 160.20 218.75
p value means, if there is no difference between the two population, that the probability of the two mean are different is p.
Mean weights of chickens fed horsebean (mean sd) and linseed (mean sd) differ significantly (t(20) = r round(chickwts_t$statistic,2), p = 0.008, \(\alpha\)=0.05).
We need to collect samples to test a hypothesis (planning on doing a 2-sample t-test) to see if there is a difference in means for phosphate concentrations in lagoons downstream from golf courses of NOT downstream from golf courses.
A priori* power analysis: give you an estimate of how many observations you’ll need to collect (really useful for justifying budgets, etc. for proposals)
sig.level: alpha _ d: effect size (Cohen’s d)
Find n for “small” effect size (~0.2), and a “large” effect size (~0.8) as endpoints for n estimations
# A priori power analysis
# How many samples needed if there is a SMALL effect size, and alpha = 0.05. power = 0.80
power_small <- pwr.t.test(n = NULL, power = 0.8, sig.level = 0.05, d = 0.2)
power_small
##
## Two-sample t test power calculation
##
## n = 393.4057
## d = 0.2
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
power_large <- pwr.t.test(n = NULL, power = 0.8, sig.level = 0.05, d = 0.8)
power_large
##
## Two-sample t test power calculation
##
## n = 25.52458
## d = 0.8
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
mortality <- read_csv("drug_mortality.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## sex = col_character(),
## ages = col_character(),
## race_and_hispanic_origin = col_character(),
## state = col_character(),
## deaths = col_integer(),
## population = col_integer(),
## death_rate = col_double()
## )
income <- read_csv("state_income.csv")
## Parsed with column specification:
## cols(
## state = col_character(),
## med_income = col_integer()
## )
mortality_1 <- mortality %>%
filter(ages == "All Ages",
sex == "Both Sexes",
race_and_hispanic_origin == "All Races-All Origins",
state != "United States") %>%
select(year, state:death_rate)
m15 <- mortality_1 %>%
filter(year == 2015) %>%
mutate(highlight = ifelse(state == "Kentucky", "Yes", "No")) %>%
arrange(-death_rate) %>%
head(10) %>%
ggplot(aes(x = reorder(state, -death_rate), y = death_rate))+
geom_col(aes(fill = highlight))+
scale_fill_manual(values = c("gray60", "red"))
m15
ggplotly(m15)
# graph_1
We’ll use full_join (dyplyr) - this means that you retain ALL observations, even if the level only exists in one of the data frames
View(mortality_1)
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L '/Library/Frameworks/R.framework/
## Resources/modules/R_de.so'' had status 1
mi_2015 <- full_join(mortality_1, income) %>%
filter(year == 2015)
## Joining, by = "state"
graph_3 <- ggplot(filter(mi_2015, med_income > 60000),
aes(x = med_income,
y = death_rate,
label = state))+
geom_point()+
geom_text_repel()
graph_3 <- ggplot(filter(mi_2015),
aes(x = med_income,
y = death_rate,
label = state))+
geom_point(aes(size = population, color = state), alpha = 0.4)+
geom_text_repel(size = 3)+
theme(legend.position = "NA")
graph_3
ggplotly(graph_3)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomTextRepel() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
ca_table <- mortality_1 %>%
filter(year >= 2010,
state == "California")
ca_table
## # A tibble: 6 x 5
## year state deaths population death_rate
## <int> <chr> <int> <int> <dbl>
## 1 2010 California 4057 37253956 10.9
## 2 2011 California 4180 37691912 11.1
## 3 2012 California 4040 38041430 10.6
## 4 2013 California 4452 38332521 11.6
## 5 2014 California 4521 38802500 11.7
## 6 2015 California 4659 39144818 11.9
ca_final <- kable(ca_table) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
ca_final
| year | state | deaths | population | death_rate |
|---|---|---|---|---|
| 2010 | California | 4057 | 37253956 | 10.9 |
| 2011 | California | 4180 | 37691912 | 11.1 |
| 2012 | California | 4040 | 38041430 | 10.6 |
| 2013 | California | 4452 | 38332521 | 11.6 |
| 2014 | California | 4521 | 38802500 | 11.7 |
| 2015 | California | 4659 | 39144818 | 11.9 |